# -*- coding: utf-8 -*-
import copy
import random
import scrapy
from scrapy import Request
from scrapy.exceptions import IgnoreRequest
from scrapy.utils.project import get_project_settings
from datetime import datetime
from zc_core.spiders.base import BaseSpider
from plap.items import ItemDataLog
from zc_core.dao.batch_dao import BatchDao
from zc_core.dao.spu_pool_dao import SpuPoolDao
from zc_core.model.items import Box
from zc_core.util.batch_gen import time_to_batch_no
from zc_core.util.done_filter import DoneFilter
from zc_core.util.http_util import retry_request
from plap.rules import parse_item_data
from plap.utils.login import SeleniumLogin


class FullSpider(BaseSpider):
    """Full-volume spider: re-collects price/status for every SPU in the pool.

    Spider arguments:
        batchNo: optional batch number; when omitted, BaseSpider derives one
            (self.batch_no) — e.g. from the current time via batch_gen.
    """
    name = 'full'
    custom_settings = {
        'CONCURRENT_REQUESTS': 8,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 8,
        'CONCURRENT_REQUESTS_PER_IP': 8,
    }
    # Product listing endpoint; formatted with the SPU id.
    item_url = 'http://mall.plap.cn/commodities/{}/products_list'

    def __init__(self, batchNo=None, *args, **kwargs):
        super(FullSpider, self).__init__(batchNo=batchNo, *args, **kwargs)
        # Create the batch record for this run.
        BatchDao().create_batch(self.batch_no)
        # Dedup filter to avoid re-collecting SPUs already done in this batch.
        # BUGFIX: use self.batch_no (normalized by BaseSpider) instead of the
        # raw batchNo argument — when batchNo is None the original code built
        # the collection name 'item_log_None', diverging from every other use
        # of the batch number in this class.
        self.done_filter = DoneFilter(self.batch_no, coll_name='item_log_{}'.format(self.batch_no))

    def start_requests(self):
        """Yield one listing request per target SPU, skipping offline/done ones."""
        settings = get_project_settings()
        cookies = SeleniumLogin().get_cookies(self.name)
        if not cookies:
            self.logger.error('init cookie failed...')
            return
        self.logger.info('init cookie: %s', cookies)

        # Hoist loop-invariant settings lookups out of the per-SPU loop.
        force_recover = settings.get('FORCE_RECOVER', False)
        max_offline = settings.get('MAX_OFFLINE_TIME', 2)

        pool_list = SpuPoolDao().get_spu_pool_list(fields={}, query={'skuName': {'$exists': True}})
        self.logger.info('全量：%s' % (len(pool_list)))
        # BUGFIX: the original pre-filter dropped already-done SPUs even when
        # FORCE_RECOVER was set, which made that setting a dead no-op (the
        # per-item recover check below could never see a done SPU). Honor it.
        if force_recover:
            dist_list = list(pool_list)
        else:
            dist_list = [x for x in pool_list if not self.done_filter.contains(x.get('_id'))]
        self.logger.info('目标：%s' % (len(dist_list)))
        # Shuffle so requests are spread randomly across the catalogue.
        random.shuffle(dist_list)
        for spu in dist_list:
            spu_id = spu.get('_id')
            material_code = spu.get('materialCode')
            # Skip SPUs that have stayed offline for too many batches.
            offline_time = spu.get('offlineTime', 0)
            if offline_time > max_offline:
                self.logger.info('忽略: [%s][%s]', spu_id, offline_time)
                continue
            if self.done_filter.contains(spu_id) and not force_recover:
                self.logger.info('已采: [%s]', spu_id)
                continue

            # Collect product price and status.
            yield Request(
                url=self.item_url.format(spu_id),
                meta={
                    'reqType': 'item',
                    'batchNo': self.batch_no,
                    'spuId': spu_id,
                    'materialCode': material_code,
                    # Shallow copy so request meta does not alias the pool doc.
                    'spu': copy.copy(spu),
                },
                cookies=cookies,
                callback=self.parse_item_data,
                errback=self.error_back,
                priority=25,
            )

    def parse_item_data(self, response):
        """Parse an item listing page; yield a Box of rows plus an ItemDataLog.

        The log item records per-SPU status: 0 when the product is offline or
        the page is missing, 1 otherwise (with the parsed row count).
        """
        meta = response.meta
        spu_id = meta.get('spuId')
        # Mark done up-front so this SPU is not re-enqueued on later runs.
        self.done_filter.put(spu_id)

        log = ItemDataLog()
        log['_id'] = spu_id
        log['spuId'] = spu_id
        log['batchNo'] = self.batch_no

        if '产品下架' in response.text or '您访问的页面没有找到' in response.text:
            # Product removed or page not found: status 0, no data rows.
            log['status'] = 0
        else:
            data_list = parse_item_data(response)
            if data_list:
                self.logger.info('报价: spu=%s, cnt=%s' % (spu_id, len(data_list)))
                yield Box('item', self.batch_no, data_list)
            # BUGFIX: the rules parser may return None (the guard above shows
            # empty results are expected); len(None) would raise TypeError.
            log['count'] = len(data_list) if data_list else 0
            log['status'] = 1

        yield log
